load packages

library(ggplot2)
library(data.table)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
data.table 1.14.2 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
library(stringr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:data.table’:

    between, first, last

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(tibble)
library(here)
here() starts at /Users/u241374/mike_tisza/teddy_virome/teddy_vir_bac_marker_geneR
library(rprojroot)
library(ggpubr)
library(rstatix)

Attaching package: ‘rstatix’

The following object is masked from ‘package:stats’:

    filter

set paths and filenames

### files
long_table=sprintf("%s/data/mp142_TGVG1.1_MPA4_combined_abundance_table_longform1.tsv", find_rstudio_root_file())
metadata_table=sprintf("%s/data/some_teddy_MP142_metadata2.all_samples1.delivery.csv", find_rstudio_root_file())

load long table and metadata, merge

long_dt <- fread(sprintf("%s", long_table), sep = "\t", header = T) %>%
  select(sampleID, rel_abundance, lineage) %>%
  mutate(kingdom = case_when(grepl("k__Bac", lineage) ~ "Bacteria", 
                             grepl("k__Vir", lineage) ~ "Virus",
                             grepl("k__Ar", lineage) ~ "Archea",
                             grepl("k__Euk", lineage) ~ "Eukaryota",
                             TRUE ~ "other"))

meta_dt <- fread(sprintf("%s", metadata_table), sep = ",", header = T) %>%
  select(-V1)

mask_id_count <- meta_dt %>%
  group_by(mask_id) %>%
  filter(n() >= 10) %>%
  summarize(total_samples = n())

merge_dt <- merge(long_dt, meta_dt, by.x = "sampleID", by.y ="sample")

persist_dt <- merge_dt %>%
  group_by(lineage, kingdom, mask_id) %>%
  summarize(samples_detected = n())
`summarise()` has grouped output by 'lineage', 'kingdom'. You can override using the `.groups` argument.
persist_dt <- merge(persist_dt, mask_id_count, by = "mask_id") %>%
  mutate(frequency_detected = samples_detected/total_samples)

dim(
  persist_dt %>% distinct(mask_id)
)
[1] 555   1

calculate stats

stat.test <- persist_dt %>% 
  filter(kingdom == "Virus" | kingdom == "Bacteria") %>%
  wilcox_test(frequency_detected ~ kingdom) %>%
  add_significance(p.col="p", output.col="p.signif")
stat.test <- stat.test %>% add_xy_position(x = "kingdom")
stat.test
NA
NA

plot violins

persist_dt %>% 
  filter(kingdom == "Virus" | kingdom == "Bacteria") %>%
  ggplot(aes(factor(kingdom), frequency_detected)) + 
  geom_violin(aes(fill = factor(kingdom)),  alpha = 0.6, draw_quantiles = c(0.25, 0.75), linetype = "dotted") +
  geom_violin(fill="transparent", draw_quantiles = c(0.5)) +
  theme_bw() + 
  xlab("") + 
  ylab("SGB Intra-subject Frequency") + 
  scale_fill_manual(values=c("#F2AD00", "#5BBCD6")) + 
  stat_pvalue_manual(stat.test, label = "p.signif", y.position = 1.05, tip.length = 0.01, coord.flip = TRUE)  + 
  coord_flip() +
  theme(legend.position = "none")

ggsave(sprintf("%s/charts/bacteria_vs_virus_SGB_intrasubject_frequency1.pdf", find_rstudio_root_file()), width = 4, height = 3)


stat.test <- as.data.frame(stat.test)
stat.test <- apply(stat.test,2,as.character)
write.csv(stat.test, file=sprintf("%s/intermediate_files/wilcoxon_test_vOTU_bOTU_persist_10samps.csv", find_rstudio_root_file()))
LS0tCnRpdGxlOiAiaW50cmEtc3ViamVjdCBwZXJzaXN0ZW5jZSBmb3IgYmFjdGVyaWEgYW5kIHZpcnVzIFNHQnMiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmxvYWQgcGFja2FnZXMKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShkYXRhLnRhYmxlKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkodGliYmxlKQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkocnByb2pyb290KQpsaWJyYXJ5KGdncHVicikKbGlicmFyeShyc3RhdGl4KQoKYGBgCgpzZXQgcGF0aHMgYW5kIGZpbGVuYW1lcwpgYGB7cn0KIyMjIGZpbGVzCmxvbmdfdGFibGU9c3ByaW50ZigiJXMvZGF0YS9tcDE0Ml9UR1ZHMS4xX01QQTRfY29tYmluZWRfYWJ1bmRhbmNlX3RhYmxlX2xvbmdmb3JtMS50c3YiLCBmaW5kX3JzdHVkaW9fcm9vdF9maWxlKCkpCm1ldGFkYXRhX3RhYmxlPXNwcmludGYoIiVzL2RhdGEvc29tZV90ZWRkeV9NUDE0Ml9tZXRhZGF0YTIuYWxsX3NhbXBsZXMxLmRlbGl2ZXJ5LmNzdiIsIGZpbmRfcnN0dWRpb19yb290X2ZpbGUoKSkKCmBgYAoKCmxvYWQgbG9uZyB0YWJsZSBhbmQgbWV0YWRhdGEsIG1lcmdlIApgYGB7cn0KbG9uZ19kdCA8LSBmcmVhZChzcHJpbnRmKCIlcyIsIGxvbmdfdGFibGUpLCBzZXAgPSAiXHQiLCBoZWFkZXIgPSBUKSAlPiUKICBzZWxlY3Qoc2FtcGxlSUQsIHJlbF9hYnVuZGFuY2UsIGxpbmVhZ2UpICU+JQogIG11dGF0ZShraW5nZG9tID0gY2FzZV93aGVuKGdyZXBsKCJrX19CYWMiLCBsaW5lYWdlKSB+ICJCYWN0ZXJpYSIsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIGdyZXBsKCJrX19WaXIiLCBsaW5lYWdlKSB+ICJWaXJ1cyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZ3JlcGwoImtfX0FyIiwgbGluZWFnZSkgfiAiQXJjaGVhIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBncmVwbCgia19fRXVrIiwgbGluZWFnZSkgfiAiRXVrYXJ5b3RhIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICBUUlVFIH4gIm90aGVyIikpCgoKbWV0YV9kdCA8LSBmcmVhZChzcHJpbnRmKCIlcyIsIG1ldGFkYXRhX3RhYmxlKSwgc2VwID0gIiwiLCBoZWFkZXIgPSBUKSAlPiUKICBzZWxlY3QoLVYxKQoKbWFza19pZF9jb3VudCA8LSBtZXRhX2R0ICU+JQogIGdyb3VwX2J5KG1hc2tfaWQpICU+JQogIGZpbHRlcihuKCkgPj0gMTApICU+JQogIHN1bW1hcml6ZSh0b3RhbF9zYW1wbGVzID0gbigpKQoKbWVyZ2VfZHQgPC0gbWVyZ2UobG9uZ19kdCwgbWV0YV9kdCwgYnkueCA9ICJzYW1wbGVJRCIsIGJ5LnkgPSJzYW1wbGUiKQoKcGVyc2lzdF9kdCA8LSBtZXJnZV9kdCAlPiUKICBncm91cF9ieShsaW5lYWdlLCBraW5nZG9tLCBtYXNrX2lkKSAlPiUKICBzdW1tYXJpemUoc2FtcGxlc19kZXRlY3RlZCA9IG4oKSkKCnBlcnNpc3RfZHQgPC0gbWVyZ2UocGVyc2lzdF9kdCwgbWFza19pZF9jb3VudCwgYnkgPSAibWFza19pZCIpICU+JQogIG11dGF0ZShmcmVxdWVuY3lfZGV0ZWN0ZWQgPSBzYW1wbGVzX2RldGVjdGVkL3RvdGFsX3NhbXBsZXMpCgpkaW0oCiAgcGVyc2lzdF9kdCAlPiUgZGlzdGluY3QobWFza19pZCkKKQoKYGBgCgpjYWxjdWxhdGUgc3RhdHMKYGBge3J9CnN0YXQudGVzdCA8LSBwZXJzaXN0X2R0ICU+JSAKICBmaWx0ZXIoa2luZ2RvbSA9PSAiVmlydXMiIHwga2luZ2RvbSA9PSAiQmFjdGVyaWEiKSAlPiUKICB3aWxjb3hfdGVzdChmcmVxdWVuY3lfZGV0ZWN0ZWQgfiBraW5nZG9tKSAlPiUKICBhZGRfc2lnbmlmaWNhbmNlKHAuY29sPSJwIiwgb3V0cHV0LmNvbD0icC5zaWduaWYiKQpzdGF0LnRlc3QgPC0gc3RhdC50ZXN0ICU+JSBhZGRfeHlfcG9zaXRpb24oeCA9ICJraW5nZG9tIikKc3RhdC50ZXN0CgoKYGBgCgpwbG90IHZpb2xpbnMKYGBge3J9CnBlcnNpc3RfZHQgJT4lIAogIGZpbHRlcihraW5nZG9tID09ICJWaXJ1cyIgfCBraW5nZG9tID09ICJCYWN0ZXJpYSIpICU+JQogIGdncGxvdChhZXMoZmFjdG9yKGtpbmdkb20pLCBmcmVxdWVuY3lfZGV0ZWN0ZWQpKSArIAogIGdlb21fdmlvbGluKGFlcyhmaWxsID0gZmFjdG9yKGtpbmdkb20pKSwgIGFscGhhID0gMC42LCBkcmF3X3F1YW50aWxlcyA9IGMoMC4yNSwgMC43NSksIGxpbmV0eXBlID0gImRvdHRlZCIpICsKICBnZW9tX3Zpb2xpbihmaWxsPSJ0cmFuc3BhcmVudCIsIGRyYXdfcXVhbnRpbGVzID0gYygwLjUpKSArCiAgdGhlbWVfYncoKSArIAogIHhsYWIoIiIpICsgCiAgeWxhYigiU0dCIEludHJhLXN1YmplY3QgRnJlcXVlbmN5IikgKyAKICBzY2FsZV9maWxsX21hbnVhbCh2YWx1ZXM9YygiI0YyQUQwMCIsICIjNUJCQ0Q2IikpICsgCiAgc3RhdF9wdmFsdWVfbWFudWFsKHN0YXQudGVzdCwgbGFiZWwgPSAicC5zaWduaWYiLCB5LnBvc2l0aW9uID0gMS4wNSwgdGlwLmxlbmd0aCA9IDAuMDEsIGNvb3JkLmZsaXAgPSBUUlVFKSAgKyAKICBjb29yZF9mbGlwKCkgKwogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICJub25lIikKCmdnc2F2ZShzcHJpbnRmKCIlcy9jaGFydHMvYmFjdGVyaWFfdnNfdmlydXNfU0dCX2ludHJhc3ViamVjdF9mcmVxdWVuY3kxLnBkZiIsIGZpbmRfcnN0dWRpb19yb290X2ZpbGUoKSksIHdpZHRoID0gNCwgaGVpZ2h0ID0gMykKCiMjIHNhdmUgc3RhdHMgdGFibGUKc3RhdC50ZXN0IDwtIGFzLmRhdGEuZnJhbWUoc3RhdC50ZXN0KQpzdGF0LnRlc3QgPC0gYXBwbHkoc3RhdC50ZXN0LDIsYXMuY2hhcmFjdGVyKQp3cml0ZS5jc3Yoc3RhdC50ZXN0LCBmaWxlPXNwcmludGYoIiVzL2ludGVybWVkaWF0ZV9maWxlcy93aWxjb3hvbl90ZXN0X3ZPVFVfYk9UVV9wZXJzaXN0XzEwc2FtcHMuY3N2IiwgZmluZF9yc3R1ZGlvX3Jvb3RfZmlsZSgpKSkKYGBgCgoKCgoKCgoKCg==